rm(list = ls())

# Set working directory
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

# Load packages
if (!require("pacman")) install.packages("pacman")
pacman::p_load(readr, tidyverse, stringi, lubridate, survey)

# Load data
dat_pre <- read.csv("./pdeq_cps_all.csv")

dat_post <- read.csv("./pdeq_pes.csv")

# Remove testing observations (from researchers)
dat_pre <- dat_pre %>% 
  filter(DistributionChannel != "preview") 

dat_post <- dat_post %>% 
  filter(DistributionChannel != "preview")

# Remove election day responses
dat_pre <- dat_pre %>% 
  filter(EndDate < "2022-10-03")

# Remove duplicate responses
dat_pre <- dat_pre %>%
  group_by(PID) %>%
  filter(if(any(gc == 1, na.rm = T)) gc == 1 else EndDate == max(EndDate)) %>%
  filter(Progress == max(Progress)) %>%
  filter(as.numeric(`Duration..in.seconds.`) == max(as.numeric(`Duration..in.seconds.`))) %>%
  ungroup() %>% 
  filter(as.numeric(`Duration..in.seconds.`) != 0)

dat_post <- dat_post %>%
  group_by(PID) %>%
  filter(if(any(gc == 1, na.rm = T)) gc == 1 else EndDate == max(EndDate)) %>%
  filter(Progress == max(Progress)) %>%
  filter(`Duration..in.seconds.` == max(`Duration..in.seconds.`)) %>%
  ungroup() %>% 
  filter(`Duration..in.seconds.` != 0)

## Unique respondents who started the survey
length(unique(dat_pre$PID)) # 4,599
length(unique(dat_post$PID)) # 1,692

# Did not answer any question (these were the first questions)
sum(is.na(dat_pre$citizen)) #15
sum(is.na(dat_post$citizen)) #54

dat_pre <- dat_pre %>% 
  filter(!is.na(citizen)) # 4,584

dat_post <- dat_post %>% 
  filter(!is.na(gender)) # 1,635

# Not eligible
inelegible_pre <- dat_pre %>% 
  filter(noncitizen == 1 | under18==1) # 101
inelegible_post <- dat_post %>% 
  filter(noncitizen == 1 | under18==1) # 23

dat_pre <- dat_pre %>% 
  filter(!PID %in% inelegible_pre$PID) # 4,483

dat_post <- dat_post %>% 
  filter(!PID %in% inelegible_post$PID) # 1,612

# Incomplete answers
dat_pre_incomplete <- dat_pre %>% 
  filter(gc != 1 | is.na(gc))
dat_post_incomplete <- dat_post %>% 
  filter(gc != 1 | is.na(gc))

### Keep only completes

dat_pre <- dat_pre %>% 
  filter(gc == 1)

dat_post <- dat_post %>% 
  filter(gc == 1)

### Recode variables for weights

dat_pre <- dat_pre %>% 
  mutate(age_group = case_when(
    age >= 18 & age < 25 ~ 1,
    age >= 25 & age < 35 ~ 2,
    age >= 35 & age < 45 ~ 3,
    age >= 45 & age < 55 ~ 4,
    age >= 55 & age < 65 ~ 5,
    age >= 65 ~ 6
    ),
    gender_group = case_when(
      gender == 1 ~ "male",
      gender == 2 ~ "female",
      TRUE ~ NA_character_
    ))

dat_post <- dat_post %>% 
  mutate(age_group = case_when(
    age >= 18 & age < 25 ~ 1,
    age >= 25 & age < 35 ~ 2,
    age >= 35 & age < 45 ~ 3,
    age >= 45 & age < 55 ~ 4,
    age >= 55 & age < 65 ~ 5,
    age >= 65 ~ 6
  ),
  gender_group = case_when(
    gender == 1 ~ "male",
    gender == 2 ~ "female",
    TRUE ~ NA_character_
  ))

### Weights pre

dat_wt_pre <- dat_pre %>% 
  dplyr::select(region, age_group, gender_group, PID) %>% na.omit()

dat_nowt_pre <- svydesign(ids =~ 1, data = dat_wt_pre, weights = NULL)

gender_dist <- tibble(gender_group = c("female", "male"), 
                      Freq = nrow(dat_wt_pre)*c(.512, .488)) # props .513 and .488 sum to 1.001, let's put .512 and .488

age_dist <- tibble(age_group = c("1", # .102
                                 "2", # .154
                                 "3", # .16
                                 "4", # .174
                                 "5", # .183
                                 "6"),  # .228
                   Freq = nrow(dat_wt_pre) * c(.102, .154, .16, .174, .183, .227)) # props sum to 1.001 so let's put .227 for 65+

region_dist <- tibble(region = c("1", # .018
                                 "2", # .025
                                 "3", #.091
                                 "4", # .03
                                 "5", # .052
                                 "6", # .011
                                 "7", # .039
                                 "8", # .012
                                 "9", # .06
                                 "10", # .072
                                 "11", # .051
                                 "12", # .034
                                 "13", # .182
                                 "14", # .24
                                 "15", # .005
                                 "16", # .046
                                 "17"), # .034
                      Freq = nrow(dat_wt_pre) * c(.018, .025, .091, .03, .052, .011, .039, .012, .06, .072,
                                            .051, .034, .182, .238, .005, .046, .034)) # props sum to 1.002 so let's
# adjust Mtl

# weight the data by the target distributions
rake <- rake(design = dat_nowt_pre,
             sample.margins = list(~gender_group, ~age_group, ~region),
             population.margins = list(gender_dist, age_dist, region_dist))
# rake <- trimWeights(rake, lower = 0.1, upper = 8, strict = TRUE)
dat_wt_pre$wt <- weights(rake)

# merging weights with full data
dat_pre <- merge(dat_pre, dat_wt_pre, all = TRUE, by = c("PID", "region", "age_group", "gender_group"))
dat_pre <- dat_pre %>% 
  mutate(wt = ifelse(is.na(wt), 1, wt))

### Weights post

colnames(dat_pre) <- paste0(colnames(dat_pre), "_pre")
dat_pre$PID <- dat_pre$PID_pre

dat_prepost <- left_join(dat_pre, dat_post, by = "PID")
dat_post <- dat_prepost %>% 
  filter(completed == 1)
dat_post$region <- dat_post$region_pre

dat_post <- dat_post[,!grepl("_pre$",names(dat_post))] 

dat_wt_post <- dat_post %>% 
  dplyr::select(region, age_group, gender_group, PID) %>% na.omit()

dat_nowt_post <- svydesign(ids =~ 1, data = dat_wt_post, weights = NULL)

gender_dist <- tibble(gender_group = c("female", "male"), 
                      Freq = nrow(dat_wt_post)*c(.512, .488)) # props .513 and .488 sum to 1.001, let's put .512 and .488

age_dist <- tibble(age_group = c("1", # .102
                                 "2", # .154
                                 "3", # .16
                                 "4", # .174
                                 "5", # .183
                                 "6"),  # .228
                   Freq = nrow(dat_wt_post) * c(.102, .154, .16, .174, .183, .227)) # props sum to 1.001 so let's put .227 for 65+

region_dist <- tibble(region = c("1", # .018
                                 "2", # .025
                                 "3", #.091
                                 "4", # .03
                                 "5", # .052
                                 "6", # .011
                                 "7", # .039
                                 "8", # .012
                                 "9", # .06
                                 "10", # .072
                                 "11", # .051
                                 "12", # .034
                                 "13", # .182
                                 "14", # .24
                                 "15", # .005
                                 "16", # .046
                                 "17"), # .034
                      Freq = nrow(dat_wt_post) * c(.018, .025, .091, .03, .052, .011, .039, .012, .06, .072,
                                                  .051, .034, .182, .238, .005, .046, .034)) # props sum to 1.002 so let's
# adjust Mtl

# weight the data by the target distributions
rake <- rake(design = dat_nowt_post,
             sample.margins = list(~gender_group, ~age_group, ~region),
             population.margins = list(gender_dist, age_dist, region_dist))
# rake <- trimWeights(rake, lower = 0.1, upper = 8, strict = TRUE)
dat_wt_post$wt <- weights(rake)

# merging weights with full data
dat_post <- merge(dat_post, dat_wt_post, all = TRUE, by = c("PID", "region", "age_group", "gender_group"))
dat_post <- dat_post %>% 
  mutate(wt = ifelse(is.na(wt), 1, wt))

### Merge datasets

colnames(dat_post) <- paste0(colnames(dat_post), "_post")
dat_post$PID <- dat_post$PID_post

dat <- left_join(dat_pre, dat_post, by = "PID")

sum(!is.na(dat_pre$wt_pre))
sum(!is.na(dat_post$wt_post))

test <- as.data.frame(table(dat$PID))
test <- filter(test, Freq!= 1)
test2 <- filter(dat, PID %in% test$Var1)
dat <- filter(dat, !PID %in% test$Var1)

## Remove speeders who failed an attention check
# Identify speeders and those who failed an attention check (pre-election)
speeders_inattentive_pre <- dat %>% 
  filter(as.numeric(`Duration..in.seconds._pre`) < (median(as.numeric(`Duration..in.seconds._pre`), na.rm = T)/3) & issues_agree_5_pre != 3)

# Identify speeders and those who failed an attention check (post-election)
speeders_inattentive_post <- dat %>% 
  filter(as.numeric(`Duration..in.seconds._post`) < (median(as.numeric(`Duration..in.seconds._post`), na.rm = T)/3) & attentioncheck_post != 3)

# Remove speeders and those who failed an attention check
datqc <- dat %>% 
  filter(!PID %in% speeders_inattentive_pre$PID & !PID %in% speeders_inattentive_post$PID)

both_speeders = dat %>% 
  filter(PID %in% speeders_inattentive_pre$PID & PID %in% speeders_inattentive_post$PID) 
nrow(both_speeders) # 2

datqc <- datqc %>%
  select(
    PID,                        # Participant ID
    Duration..in.seconds._pre,  # Pre-election survey duration
    issues_agree_5_pre,         # Attention check pre
    Duration..in.seconds._post, # Post-election survey duration
    attentioncheck_post,        # Attention check post
    completed_post,             # Completion status post
    influence_1_post,           # Perceptions that fraud influenced the outcome post
    trust_accuracy_3_post,      # Trust in results post
    trust_accuracy_2_post,      # Counted accurately post
    fair_election_post,         # Elections Quebec administered fairly
    legitimate_winner_1_pre,    # Trudeau legitimacy pre
    legitimate_winner_2_pre,    # Biden legitimacy pre
    age_pre,                    # Age pre
    education_pre,              # Education pre
    gender_group_pre,           # Gender pre
    region.1_pre,               # Region pre
    wt_pre,                     # Pre-election survey weight
    wt_post,                    # Post-election survey weight
    ideol_1_pre                 # Ideology pre
  )

# Save the dataset
write.csv(datqc, file = "./qemp.csv", row.names = FALSE)